#------------------------------------------------------------------------------
# generate daily regression data by combining demographic data with
# 0. cancer / palliative care
# 1. pain
# 2. opioids
# 3. therapy 
# 4. opioid strength indicators
#==============================================================================

load_library = c('bit64','data.table','fst','future.apply','stringr','logger','vroom')
invisible(lapply(load_library, function(x) library(x, character.only=TRUE, quietly= TRUE)))

# project root on the cluster file system
bucket = '/N/project/iuni_doctorshopping'

# read arguments from Snakefile
# The arguments are positional; `expected_args` lists them in the order the
# script consumes them, so a short command line fails with a readable message
# instead of a bare "subscript out of bounds".
args = commandArgs(TRUE)
expected_args = c('infile_mbr','infile_geo','infile_pain','infile_opioids',
                  'infile_therapy','infile_indicator_opioid','infile_opioid_therapy',
                  'infile_backpain','infile_neckpain','infile_limbpain','outfile')
if (length(args) < length(expected_args)) {
	stop('expected ', length(expected_args), ' arguments (',
	     paste(expected_args, collapse=', '), ') but got ', length(args),
	     call. = FALSE)
}

infile_mbr = args[[1]]
infile_geo = args[[2]]
infile_pain = args[[3]]
infile_opioids = args[[4]]
infile_therapy = args[[5]]
infile_indicator_opioid = args[[6]]
infile_opioid_therapy = args[[7]]
infile_backpain = args[[8]]
infile_neckpain = args[[9]]
infile_limbpain = args[[10]]
outfile = args[[11]]

logger::log_info('now reading data ')

# member table is stored as fst; every other input is read with fread
mbr = fst::read_fst(infile_mbr, as.data.table = TRUE)
mbr_geo = data.table::fread(infile_geo)

# patient lists: pain (overall and by site)
pain = data.table::fread(infile_pain)
backpain = data.table::fread(infile_backpain)
neckpain = data.table::fread(infile_neckpain)
limbpain = data.table::fread(infile_limbpain)

# patient lists: opioids, therapy, and the opioid indicator / opioid-therapy tables
opioids = data.table::fread(infile_opioids)
therapy = data.table::fread(infile_therapy)
op_indicator = data.table::fread(infile_indicator_opioid)
op_therapy = data.table::fread(infile_opioid_therapy)

# Reduce the member table to one row per PATID, keeping only the columns
# used below.
mbr = unique(mbr[, c('PATID','GDR_CD','YRDOB'), with=FALSE], by='PATID')

# year of birth as integer
mbr[, YRDOB := as.integer(YRDOB)]

# Recode gender into `female`: 1 for 'F', 0 for 'M'; rows with any other
# GDR_CD value keep that value.
# NOTE(review): `female` is created as a copy of GDR_CD, so the 1L/0L written
# below land in a column of GDR_CD's type -- confirm this is intended.
mbr[, female := GDR_CD]
mbr[GDR_CD %in% c('F','M'), female := as.integer(GDR_CD == 'F')]
mbr[, GDR_CD := NULL]

# Left-join dfB onto dfA by PATID, keeping every row of dfA.
#
# Args:
#   dfA:      data.table with a PATID column. If PATID is not integer64 it is
#             converted to integer64 in place (by reference).
#   dfB:      data.table with a PATID column. It is never modified; any change
#             needed for the merge is made on a private copy.
#   dfB_name: label for dfB; it selects the merge mode:
#               'mbr', 'indicator' -> plain left join, dfB's columns are added;
#               'therapy'          -> dfB is reduced to its unique PATIDs and
#                                     then handled as a flag table;
#               anything else      -> flag table: dfB's columns are joined and
#                                     an integer column named dfB_name is
#                                     added, 1L where the PATID occurs in dfB
#                                     and 0L otherwise.
# Returns: the merged data.table. When dfB has no rows, dfA is returned as is
#          (no flag column is added in that case).
# Raises:  an error when dfA has no rows.
merge_by_patid = function(dfA, dfB, dfB_name){
	if (nrow(dfA) == 0) {
		stop('dfA is empty')
	}
	if (nrow(dfB) == 0) {
		message('no merge because ', dfB_name,' is empty')
		return(dfA)
	}
	message('now merging with ', dfB_name)

	# inherits() always yields a single TRUE/FALSE; comparing class() with
	# `!=` yields one value per class element, which if() cannot take.
	if (!inherits(dfA$PATID, 'integer64')) {
		dfA[, PATID := as.integer64(PATID)]
	}

	is_flag_table = !(dfB_name %in% c('mbr','indicator'))

	# only the patient id matters for the therapy table
	if (dfB_name == 'therapy') {
		dfB = dfB[, 'PATID']
	}

	# `:=` works by reference, so take a private copy before touching dfB;
	# otherwise the caller's table would gain the flag column / a recast PATID.
	needs_cast = !inherits(dfB$PATID, 'integer64')
	if (is_flag_table || needs_cast) {
		dfB = data.table::copy(dfB)
	}
	if (needs_cast) {
		dfB[, PATID := as.integer64(PATID)]
	}

	# de-duplicate after the cast so ids are compared as integer64
	if (dfB_name == 'therapy') {
		dfB = unique(dfB)
	}

	if (!is_flag_table) {
		return(merge(dfA, dfB, by='PATID', all.x=TRUE))
	}

	# flag table: mark matched patients with 1L, then fill the unmatched rows
	# (NA after the left join) with 0L
	dfB[, (dfB_name) := 1L]
	dfA = merge(dfA, dfB, by='PATID', all.x=TRUE)
	dfA[is.na(get(dfB_name)), (dfB_name) := 0L]
	return(dfA)
}

# Start from the geography table and attach member demographics.
udata = merge_by_patid(dfA = mbr_geo, dfB = mbr, dfB_name = 'mbr')

# Attach the patient lists one at a time; the third argument is the label
# passed to merge_by_patid for that table.
udata = merge_by_patid(dfA = udata, dfB = pain, dfB_name = 'pain')
udata = merge_by_patid(dfA = udata, dfB = backpain, dfB_name = 'backpain')
udata = merge_by_patid(dfA = udata, dfB = neckpain, dfB_name = 'neckpain')
udata = merge_by_patid(dfA = udata, dfB = limbpain, dfB_name = 'limbpain')
udata = merge_by_patid(dfA = udata, dfB = opioids, dfB_name = 'opioids')
udata = merge_by_patid(dfA = udata, dfB = therapy, dfB_name = 'therapy')
udata = merge_by_patid(dfA = udata, dfB = op_indicator, dfB_name = 'indicator')

# The opioid-therapy table is left-joined directly, after its PATID is cast
# to integer64.
op_therapy[, PATID := as.integer64(PATID)]
udata = merge(udata, op_therapy, by = 'PATID', all.x = TRUE)

logger::log_info('now writing data ', outfile)

# third argument of write_fst is the compression level
fst::write_fst(udata, path = outfile, compress = 100)
